{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Plate ID</th>\n",
       "      <th>Registration State</th>\n",
       "      <th>Vehicle Make</th>\n",
       "      <th>Violation Time</th>\n",
       "      <th>Street Name</th>\n",
       "      <th>Vehicle Color</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>J58JKX</td>\n",
       "      <td>NJ</td>\n",
       "      <td>HONDA</td>\n",
       "      <td>0523P</td>\n",
       "      <td>43 ST</td>\n",
       "      <td>BK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KRE6058</td>\n",
       "      <td>PA</td>\n",
       "      <td>ME/BE</td>\n",
       "      <td>0428P</td>\n",
       "      <td>UNION ST</td>\n",
       "      <td>BLK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>444326R</td>\n",
       "      <td>NJ</td>\n",
       "      <td>LEXUS</td>\n",
       "      <td>0625A</td>\n",
       "      <td>CLERMONT AVENUE</td>\n",
       "      <td>BLACK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>F728330</td>\n",
       "      <td>OH</td>\n",
       "      <td>CHEVR</td>\n",
       "      <td>1106A</td>\n",
       "      <td>DIVISION AVE</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FMY9090</td>\n",
       "      <td>NY</td>\n",
       "      <td>JEEP</td>\n",
       "      <td>1253A</td>\n",
       "      <td>GRAND ST</td>\n",
       "      <td>GREY</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Plate ID Registration State Vehicle Make Violation Time      Street Name  \\\n",
       "0   J58JKX                 NJ        HONDA          0523P            43 ST   \n",
       "1  KRE6058                 PA        ME/BE          0428P         UNION ST   \n",
       "2  444326R                 NJ        LEXUS          0625A  CLERMONT AVENUE   \n",
       "3  F728330                 OH        CHEVR          1106A     DIVISION AVE   \n",
       "4  FMY9090                 NY         JEEP          1253A         GRAND ST   \n",
       "\n",
       "  Vehicle Color  \n",
       "0            BK  \n",
       "1           BLK  \n",
       "2         BLACK  \n",
       "3           NaN  \n",
       "4          GREY  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = '../data/nyc-parking-violations-2020.csv'\n",
    "\n",
    "df = pd.read_csv(filename,\n",
    "                 usecols=['Plate ID',  'Registration State',\n",
    "                        'Vehicle Make', 'Vehicle Color', 'Violation Time', 'Street Name'])\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12495734"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many rows are in the data frame when it is read into memory?\n",
    "len(df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12048375"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remove rows with any missing data (i.e., a `NaN` value). How many rows remain after doing this pruning? \n",
    "all_good_df = df.dropna()\n",
    "\n",
    "len(all_good_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "447359"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many parking tickets have some missing information, then?\n",
    "len(df.index) - len(all_good_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'$44,735,900'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# If each parking ticket brings $100 into the city, and missing data means that the ticket can be\n",
    "# successfully contested, how much money might New York City lose as a result of such missing data?\n",
    "\n",
    "f'${(len(df.index) - len(all_good_df) ) * 100:,}'  # WOW!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12431949"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's instead assume that a ticket can only be dismissed if the license plate, state, vehicle\n",
    "# make, and/or street name are all there. Remove rows that are missing one or more of these. \n",
    "# How many rows remain?\n",
    "\n",
    "semi_good_df = df.dropna(subset=['Plate ID', 'Registration State', 'Vehicle Make', 'Street Name'])\n",
    "len(semi_good_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "63785"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many rows did we remove?\n",
    "len(df.index) - len(semi_good_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'$6,378,500'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Assuming $100/ticket, how much money would the city lose as as result of this missing data?\n",
    "\n",
    "f'${(len(df.index) - len(semi_good_df.index) ) * 100:,}'   # a bit more reasonable..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12494116"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's instead assume that a ticket can only be dismissed if the license plate, state, and/or\n",
    "# street name are all there. Remove rows that are missing one or more of these. How many rows remain?\n",
    "loosest_df = df.dropna(subset=['Plate ID', 'Registration State', 'Street Name'])\n",
    "len(loosest_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1618"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many rows did we remove?\n",
    "len(df.index) - len(loosest_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'$161,800'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Assuming $100/ticket, how much money would the city lose as as result of this missing data?\n",
    "\n",
    "f'${(len(df.index) - len(loosest_df.index) ) * 100:,}'   # ah, much better..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
